/*******************************************************************************

Name: Table 2 - Summary of violations for system-years (unmatched panel)

Authors: Jonathan Baker, Lori Bennear, Sheila Olmstead

Input files:
	Alternate panel: natlCWS_yr_Panel_mod.dta (main panel)

Notes: the alternate panel does not contain the results of the matching
	   procedure, but does contain individual violations by rule.
		
*******************************************************************************/


* ========================== START START START ==========================
clear
clear matrix
set more off

* ______________________________________________________________________________
* set paths and create log file
global root "C:/Users/jbaker/Documents/Research/SDWIS/Analysis/STATA"
global input "$root/Input"
global intermediate "$root/Intermediate"
global log "$root/LogFiles"
global output "$root/Output"
global outpanel "$root/OutputPanel"
global figures "$root/Figures"

log using "$log/Table2_violationSummaryStatistics.log", replace name(vstats)
set more off

* ============================================================================== 
* table of violations summary characteristics

use "$outpanel/natlCWS_yr_Panel_mod.dta", clear
keep if id2001==1

// creating population-category variables
gen pcat0 = (inrange(population_served_count,0,500)) // population category
gen pcat1 = (inrange(population_served_count,501,9999))
gen pcat2 = (inrange(population_served_count,10000,99999))
gen pcat3 = (population_served_count >= 100000)

// variables for table of violations
gen microb = HLTH_110 + HLTH_121 + HLTH_122 + HLTH_123 + HLTH_130 + HLTH_140
gen dbp = HLTH_210 + HLTH_220 + HLTH_230
gen I_microb = (microb > 0)
gen I_dbp = (dbp > 0)
gen I_TT_999 = (TT_999 > 0)
gen I_MRDL_999 = (MRDL_999 > 0)
gen I_MR_999 = (MR_999 > 0)
gen I_OTH_999 = (OTH_999 > 0)

foreach i of numlist 0/3 {
	gen hviol`i' = HLTH_999 if pcat`i' == 1
	gen I_hviol`i' = (hviol`i' > 0 & hviol`i' !=.)
	replace I_hviol`i' = . if hviol`i' == .
	}
local threshold "501 10k 100k"
foreach i of local threshold {
	gen hv`i' = HLTH_999 if T_`i' == 1
	gen I_hv`i' = (hv`i' > 0 & hv`i' !=.)
	replace I_hv`i' = . if hv`i' == .
	}

mat avgv = J(14,7,.)
set more off
local rows HLTH_999 hviol0 hviol1 hviol2 hviol3 hv501 hv10k ALL_999 MCL_999 TT_999 microb dbp MR_999 OTH_999   
local rcnt = 0
foreach r of local rows {
	local rcnt = `rcnt' + 1
	quietly summarize I_`r'
	mat avgv[`rcnt',6] = r(mean)
	quietly summarize `r' 
	mat avgv[`rcnt',1] = r(sum)
	mat avgv[`rcnt',2] = r(mean)
	mat avgv[`rcnt',3] = r(sd)
	mat avgv[`rcnt',4] = r(min)
	mat avgv[`rcnt',5] = r(max)
	mat avgv[`rcnt',7] = r(N)
	}
mat rownames avgv = ///
"health violations" ///
"systems serving 0-500" ///
"systems serving 501-9,999" ///
"systems serving 10k-99,999" ///
"systems serving >=100k" ///
"Tpub=1(service pop. >500)" ///
"Tmail=1(service pop. >= 10k)" ///
"All violations" ///
"MCL violations" ///
"TT violations" ///
"Microbial violations" ///
"DBP violations" ///
"MR violations" ///
"Other violations"

putexcel set "$output\Table2_violationSummaryStatistics.xlsx", ///
	sheet(violationStats, replace) modify
putexcel A1 = matrix(avgv), rownames nformat(number_d2)
* note("DBP violations are disinfection byproducts, primarily Trihalomethanes.")

* ========================== DONE DONE DONE ==========================
log close vstats
